home
***
CD-ROM
|
disk
|
FTP
|
other
***
search
/
MACD 5
/
MACD 5.bin
/
workbench
/
tools
/
czesc_2
/
makewords
/
source
/
dicttotables.c
< prev
next >
Wrap
C/C++ Source or Header
|
1990-08-12
|
9KB
|
328 lines
/* DictToTables.c - convert an ASCII dictionary file into a file of two tables
containing the frequency of occurrence of each trigram in the dictionary.
"Words" in the dictionary are groups of 3 to MAXLEN letters. All other
characters are ignored. Thus you may use any text file that contains
a variety (the bigger the better) of words. You could even use a
Latin text to generate frequency tables for Latin.
The trigram frequencies are first stored in an array ( nn[][][] ),
then put in the two tables, then the two tables are written to a file.
The file, after compilation, may be linked with Unjumble.o or PhoneWords.o
The two tables are a bit table and a nybble table, wherein the frequencies
have been scaled from 'short int' to nybble (4 bits) size.
The bit and nybble tables together allow the trigram frequencies to be
reconstructed. If the frequency for a trigram is non-zero then the
corresponding bit in the bit table is set. The frequency for that trigram
is stored as the next nybble in the nybble table. This strategy requires only
the non-zero frequencies be stored, with the bit table showing the
position of the non-zero trigrams. With about 4000 non-zero trigrams only
about 4800 bytes are required to store the trigram data.
dictionary -> words -> nn[][][] -> bit_table/nybble_table -> file
Ron Charlton
9002 Balcor Circle
Knoxville, TN 37923
Phone: (615)694-0800
PLINK: R*CHARLTON
BINTNET: charltr@utkvx1
05-Jul-90
This program is in the public domain and may be used for any purpose.
Usage: 1> DictToTables
*/
#include <ctype.h>
#include <stdio.h>
#include <assert.h>
#define MAXLEN 15
#define MAXBUCKETS 2000
#define TRUE 1
#define FALSE 0
#define BIT_TABLE_SIZE (26 * 26 * 26 / 32 + 1)
unsigned long bit_table [ BIT_TABLE_SIZE ]; /* the bit table */
unsigned long *nybble_table; /* the nybble table (allocated later) */
int nybble_table_size;
unsigned short nn[27][27][27]; /* array of trigram frequencies */
short bucket [ MAXBUCKETS ];
int nonzero; /* how many trigrams exist */
char infile [ 81 ] = "dict.txt";
char outfile [ 81 ] = "tables.c";
char name [ 81 ];
char wordbuf [ 256 ];
FILE *in, *out, *fopen();
main()
{
int len;
unsigned long *calloc();
printf ("Convert an ASCII dictionary file to one containing two tables\n"
"encoding trigram weights (frequencies). The input file should\n"
"many different words in the desired language.\n\n" );
/* get file names and open them */
do_infile();
do_outfile();
printf ( "analyzing dictionary...\n" );
/* count trigrams in "words" from infile */
while ( len = getword () )
if ( len >= 3 && len <= MAXLEN )
count_trigrams_in_word ( wordbuf );
close ( in );
printf ( "counting trigrams...\n" );
do_statistics();
nybble_table_size = nonzero / 8 + 1; /* how many longs in table */
nybble_table = ( unsigned long *)
calloc ( (size_t) nybble_table_size,
(size_t) sizeof ( long )); /* allocate */
if ( nybble_table == NULL )
{
printf ( "Can't allocate memory for nybble_table.\n" );
exit ( 20 );
}
printf ( "Building tables...\n" );
build_tables(); /* make bit and nybble tables */
printf ( "writing tables to file...\n" );
nybble_table_to_file(); /* store the nybble table */
bit_table_to_file(); /* store the bit table */
close ( out );
print_results();
}
/*--------------------------------------------------------------------*/
count_trigrams_in_word ( w )
/* store count of each trigram in nn[][][] */
char *w;
{
register int len, pos;
if ( ( len = strlen ( w ) ) >= 3 )
for ( pos = 0; pos <= len - 3; pos++ )
++ nn [ w[pos] - '@' ] [ w[pos+1] - '@' ] [ w[pos+2] - '@' ];
}
/*--------------------------------------------------------------------*/
int getword ()
/* get a word (contiguous group of letters) from infile & make uppercase.
RETURNS - length of word in bytes */
{
register int inword = FALSE, pos = 0, c;
while ( ( c = getc ( in ) ) != EOF )
{
c = toupper ( c );
if ( isalpha ( c ) )
{
inword = TRUE;
wordbuf [ pos++ ] = (char) c;
}
else if ( inword )
{
wordbuf [ pos ] = '\0';
return ( strlen ( wordbuf ) );
}
}
wordbuf [ pos ] = '\0';
return ( strlen ( wordbuf ) );
}
/*--------------------------------------------------------------------*/
do_statistics()
/* count nonzero trigram values, etc. */
{
register int i, j, k;
for ( i = 1; i <= 26; i++ )
for ( j = 1; j <= 26; j++ )
for ( k = 1; k <= 26; k++ )
if ( nn[i][j][k] ) /* filter out zeroes */
{
++nonzero;
++bucket [ nn[i][j][k] ];
}
}
/*------------------------------------------------------------------------*/
print_results()
/* print results */
{
register int i;
for ( i = 0; i < MAXBUCKETS; i++ )
if ( bucket [ i ] )
printf ( "%4d trigrams occurred %4d times\n", bucket [i], i );
printf ( "non-zero trigrams = %d\n", nonzero );
}
/*-----------------------------------------------------------------------*/
do_infile()
{
printf ( "Enter input file name [%s]: ", infile );
gets ( name );
if ( strlen ( name ) > 0 )
strcpy ( infile, name );
if ( ( in = fopen ( infile, "r" ) ) == NULL )
{
printf ( "Can't open %s for input\n", infile );
exit ( 19 );
}
}
/*-----------------------------------------------------------------------*/
do_outfile()
{
printf ( "Enter output file name [%s]: ", outfile );
gets ( name );
if ( strlen ( name ) > 0 )
strcpy ( outfile, name );
if ( ( out = fopen ( outfile, "w" ) ) == NULL )
{
printf ( "Can't open %s for output\n", outfile );
exit ( 20 );
}
}
/*------------------------------------------------------------------------*/
short log2 ( n )
/* convert short to fit in a nybble by taking log base 2 (truncated):
n nonzero - return log base 2 (truncated) of n.
n zero - return 0 */
unsigned short n;
{
register int log = 0;
while ( n )
{
++log;
n >>= 1;
}
assert ( log < 16 ); /* so as to fit in a nybble */
return ( log );
}
/*------------------------------------------------------------------------*/
build_tables()
/* use nn[][][] to build the bit and nybble tables */
{
register int ptr = 0, i, j, k;
for ( i = 1; i <= 26; i++ )
for ( j = 1; j <= 26; j++ )
for ( k = 1; k <= 26; k++ )
if ( nn [ i ] [ j ] [ k ] )
{
store_nybble ( ptr++, log2 ( nn [ i ] [ j ] [ k ] ) );
set_bit ( ( i - 1 ) * 26*26 + ( j - 1 ) * 26 + ( k - 1 ) );
}
}
/*------------------------------------------------------------------------*/
store_nybble ( nyb_number, value )
/* put value in nybble pointed to by nyb_number */
int nyb_number, value;
{
int longword, nybble_in_longword;
nybble_in_longword = nyb_number & 7; /* 3 LSBits pick nybble position */
longword = nyb_number >> 3; /* which longword in table */
assert ( longword < nybble_table_size );
nybble_table [ longword ] |=
(long)value << ( nybble_in_longword * 4 ); /* store it */
}
/*------------------------------------------------------------------------*/
set_bit ( n )
/* set bit n in the bit table of 32 bit ints */
int n;
{
int bit, longword;
longword = n >> 5; /* which longword in table */
bit = n & 0x1F; /* bit 0 - 31 in longword */
assert ( longword < BIT_TABLE_SIZE );
assert ( bit < 32 );
bit_table [ longword ] |= ( 1L << bit ); /* set the bit */
}
/*------------------------------------------------------------------------*/
nybble_table_to_file()
/* store the nybble table in the file */
{
int longword;
fprintf ( out, "/* nybble table of trigram frequencies */\n\n" );
fprintf ( out, "unsigned long nybble_table [] = { 0x%lx\n",
nybble_table [ 0 ] );
for ( longword = 1; longword < nybble_table_size; longword++ )
{
fprintf ( out, ",0x%lx", nybble_table [ longword ] );
if ( ( longword % 6 ) == 0 )
fprintf ( out, "\n" );
}
fprintf ( out, "\n\t};\n\n" );
fprintf ( out, "int nybble_table_size = %d;\n", nybble_table_size );
}
/*--------------------------------------------------------------------*/
bit_table_to_file()
/* store the bit table in the file */
{
int longword;
fprintf ( out, "\n\n /* = = = = = = = = = = = = = = = = = = */ \n\n" );
fprintf ( out, "/* bit table of trigram frequencies */\n\n" );
fprintf ( out, "unsigned long bit_table [] = { 0x%lx\n", bit_table [ 0 ] );
for ( longword = 1; longword < BIT_TABLE_SIZE; longword++ )
{
fprintf ( out, ",0x%lx", bit_table [ longword ] );
if ( ( longword % 6 ) == 0 )
fprintf ( out, "\n" );
}
fprintf ( out, "\n\t};\n\n" );
fprintf ( out, "int bit_table_size = %d;\n", BIT_TABLE_SIZE );
}
/*--------------------------- END OF FILE --------------------------------*/